{
 "cells": [
  {
   "cell_type": "markdown",
   "source": [
    "## 3.3.1 上下文和目标词"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": true,
    "ExecuteTime": {
     "end_time": "2023-05-05T09:22:52.172437600Z",
     "start_time": "2023-05-05T09:22:52.158444500Z"
    }
   },
   "outputs": [],
   "source": [
    "import sys\n",
    "\n",
    "import numpy as np\n",
    "\n",
    "sys.path.append('..')\n",
    "from common.util import preprocess"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[0 1 2 3 4 1 5 6]\n"
     ]
    }
   ],
   "source": [
    "text = 'You say goodbye and I say hello.'\n",
    "corpus, word_to_id, id_to_word = preprocess(text)\n",
    "print(corpus)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-05T09:22:52.582336200Z",
     "start_time": "2023-05-05T09:22:52.573333700Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{0: 'you', 1: 'say', 2: 'goodbye', 3: 'and', 4: 'i', 5: 'hello', 6: '.'}\n"
     ]
    }
   ],
   "source": [
    "print(id_to_word)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-05T09:22:53.111087200Z",
     "start_time": "2023-05-05T09:22:53.093087200Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "outputs": [],
   "source": [
    "#生成上下文和目标词的函数\n",
    "def create_contexts_target(corpus, window_size=1):\n",
    "    target = corpus[window_size:-window_size]\n",
    "    contexts = []\n",
    "\n",
    "    for idx in range(window_size, len(corpus) - window_size):\n",
    "        cs = []\n",
    "        for t in range(-window_size, window_size + 1):\n",
    "            if t == 0:\n",
    "                continue\n",
    "            cs.append(corpus[idx + t])\n",
    "        contexts.append(cs)\n",
    "\n",
    "    return np.array(contexts), np.array(target)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-05T09:23:58.024145200Z",
     "start_time": "2023-05-05T09:23:58.003121Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "outputs": [],
   "source": [
    "contexts, target = create_contexts_target(corpus, window_size=1)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-05T09:23:58.395417600Z",
     "start_time": "2023-05-05T09:23:58.381418400Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[[0 2]\n",
      " [1 3]\n",
      " [2 4]\n",
      " [3 1]\n",
      " [4 5]\n",
      " [1 6]]\n"
     ]
    }
   ],
   "source": [
    "print(contexts)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-05T09:24:03.279358300Z",
     "start_time": "2023-05-05T09:24:03.275360700Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1 2 3 4 1 5]\n"
     ]
    }
   ],
   "source": [
    "print(target)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-05T09:24:11.093440400Z",
     "start_time": "2023-05-05T09:24:11.089440600Z"
    }
   }
  },
  {
   "cell_type": "markdown",
   "source": [
    "## 3.3.2 转换为one-hot表示"
   ],
   "metadata": {
    "collapsed": false
   }
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "outputs": [],
   "source": [
    "import sys\n",
    "\n",
    "sys.path.append('..')\n",
    "from common.util import preprocess, create_contexts_target, convert_one_hot"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-05T09:25:43.100966400Z",
     "start_time": "2023-05-05T09:25:43.073965500Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "outputs": [],
   "source": [
    "text = 'You say goodbye and I say hello.'"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-05T09:25:49.391785500Z",
     "start_time": "2023-05-05T09:25:49.386785900Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "outputs": [],
   "source": [
    "corpus, word_to_id, id_to_word = preprocess(text)\n",
    "contexts, target = create_contexts_target(corpus, window_size=1)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-05T09:26:01.420850200Z",
     "start_time": "2023-05-05T09:26:01.392821600Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "outputs": [],
   "source": [
    "vocab_size = len(word_to_id)\n",
    "target = convert_one_hot(target, vocab_size)\n",
    "contexts = convert_one_hot(contexts, vocab_size)"
   ],
   "metadata": {
    "collapsed": false,
    "ExecuteTime": {
     "end_time": "2023-05-05T09:26:13.278435600Z",
     "start_time": "2023-05-05T09:26:13.247436900Z"
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [],
   "source": [],
   "metadata": {
    "collapsed": false
   }
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
